Install pandas by running one of the following commands at the command prompt (or the Anaconda prompt):
pip install pandas
or
conda install pandas
Once pandas is installed, import it into your application with the import keyword. The version string is stored in the __version__ attribute:
import pandas as pd
print(pd.__version__)
1.4.4
mydict = {
'student': ["Arnab", "Mainak", "Kunal","Sourav"],
'Age': [24,25,25,26]
}
type(mydict)
dict
mydat= pd.DataFrame(mydict)
mydat
| | student | Age |
|---|---|---|
| 0 | Arnab | 24 |
| 1 | Mainak | 25 |
| 2 | Kunal | 25 |
| 3 | Sourav | 26 |
type(mydat)
pandas.core.frame.DataFrame
Similarly, using NumPy we can also create a DataFrame, like this:
import numpy as np
# Here, np.arange(0,20).reshape(5,4) is creating a 2D array i.e., a matrix with dim (5x4).
df= pd.DataFrame(np.arange(0,20).reshape(5,4),index=None,columns=["Colm1","Colm2","Colm3","Colm4"])
print(df)
type(df)
   Colm1  Colm2  Colm3  Colm4
0      0      1      2      3
1      4      5      6      7
2      8      9     10     11
3     12     13     14     15
4     16     17     18     19
pandas.core.frame.DataFrame
A Pandas Series is like a column in a table.
It is a one-dimensional array holding data of any type.
a=[1,2,3,4,5]
s1= pd.Series(a)
s1
0    1
1    2
2    3
3    4
4    5
dtype: int64
type(s1)
pandas.core.series.Series
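A Series also carries an index; by default it runs from 0 to n-1, as above, but we can supply our own labels and then access values by label. A quick sketch (the letter labels here are made up for illustration):

s2 = pd.Series([10, 20, 30], index=["a", "b", "c"]) ## custom index labels
s2["b"] ## access by label -> 20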
There are two ways of accessing the elements of a DataFrame:

1. loc: returns one or more specified row(s), selected by index label.
2. iloc: returns one or more specified row(s) and column(s), selected by integer position.

(A short sketch contrasting the two on a non-default index follows the examples below.)

df.loc[0] ## 1st row of the df
Colm1    0
Colm2    1
Colm3    2
Colm4    3
Name: 0, dtype: int32
type(df.loc[0])
pandas.core.series.Series
df.loc[[0,1]]
| | Colm1 | Colm2 | Colm3 | Colm4 |
|---|---|---|---|---|
| 0 | 0 | 1 | 2 | 3 |
| 1 | 4 | 5 | 6 | 7 |
type(df.loc[[0,1]])
pandas.core.frame.DataFrame
Note: when loc selects a single row, the return type is a Series; when it selects more than one row, the return type is a DataFrame.
df.iloc[0,0] ## (1,1) element of df
0
df.iloc[0:2,0:3] ## first two rows and first three columns
| | Colm1 | Colm2 | Colm3 |
|---|---|---|---|
| 0 | 0 | 1 | 2 |
| 1 | 4 | 5 | 6 |
type(df.iloc[0:2,0:3])
pandas.core.frame.DataFrame
df.iloc[:,0]
0     0
1     4
2     8
3    12
4    16
Name: Colm1, dtype: int32
type(df.iloc[:,0])
pandas.core.series.Series
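The label-versus-position distinction between loc and iloc is invisible on df, whose index happens to be 0, 1, 2, ...; with a non-default index the two behave differently. A quick sketch with made-up letter labels:

df_lab = pd.DataFrame(np.arange(0,20).reshape(5,4),
                      index=["a","b","c","d","e"],
                      columns=["Colm1","Colm2","Colm3","Colm4"])
df_lab.loc["a"] ## row selected by its label "a"
df_lab.iloc[0] ## row selected by its position 0 (the same row here)
## df_lab.loc[0] would raise a KeyError, since 0 is not a label in this index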
Instead of using loc or iloc, we can simply use plain indexing:
df[0:2] # 1st two rows
| | Colm1 | Colm2 | Colm3 | Colm4 |
|---|---|---|---|---|
| 0 | 0 | 1 | 2 | 3 |
| 1 | 4 | 5 | 6 | 7 |
df["Colm1"]
0     0
1     4
2     8
3    12
4    16
Name: Colm1, dtype: int32
df[["Colm1","Colm2"]]
| | Colm1 | Colm2 |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 4 | 5 |
| 2 | 8 | 9 |
| 3 | 12 | 13 |
| 4 | 16 | 17 |
df.values
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])
type(df.values)
numpy.ndarray
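As a side note, newer pandas versions also provide to_numpy(), which is the now-recommended way to obtain the same NumPy array:

df.to_numpy() ## same values as df.values, returned as a numpy.ndarray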
df["Colm1"].value_counts()
0     1
4     1
8     1
12    1
16    1
Name: Colm1, dtype: int64
type(df["Colm1"].value_counts())
pandas.core.series.Series
df[["Colm1","Colm2"]].value_counts()
Colm1  Colm2
0      1        1
4      5        1
8      9        1
12     13       1
16     17       1
dtype: int64
df2= pd.DataFrame(np.array([[1,2,3,4],
[2,4,7,9],
[6,4,5,7],
[5,5,7,9]]),columns=["colm1","colm2","colm3","colm4"])
df2
| | colm1 | colm2 | colm3 | colm4 |
|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 |
| 1 | 2 | 4 | 7 | 9 |
| 2 | 6 | 4 | 5 | 7 |
| 3 | 5 | 5 | 7 | 9 |
df2[["colm2","colm4"]].value_counts()
colm2  colm4
2      4        1
4      7        1
       9        1
5      9        1
dtype: int64
df.shape
(5, 4)
df2.shape
(4, 4)
df.isnull().sum() ## For NULL values
Colm1    0
Colm2    0
Colm3    0
Colm4    0
dtype: int64
df.isna().sum()
Colm1    0
Colm2    0
Colm3    0
Colm4    0
dtype: int64
df3= pd.DataFrame(np.array([[1,np.nan,3,4],
[2,4,np.nan,9],
[6,4,5,7],
[5,5,7,9]]),columns=["colm1","colm2","colm3","colm4"])
df3
| | colm1 | colm2 | colm3 | colm4 |
|---|---|---|---|---|
| 0 | 1.0 | NaN | 3.0 | 4.0 |
| 1 | 2.0 | 4.0 | NaN | 9.0 |
| 2 | 6.0 | 4.0 | 5.0 | 7.0 |
| 3 | 5.0 | 5.0 | 7.0 | 9.0 |
df3.isna().sum()
colm1    0
colm2    1
colm3    1
colm4    0
dtype: int64
df3.isnull().sum()
colm1    0
colm2    1
colm3    1
colm4    0
dtype: int64
iris=pd.read_csv("D:/Users/User/Downloads/iris.csv")
iris.head(5)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
iris.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
df3.dropna()
| | colm1 | colm2 | colm3 | colm4 |
|---|---|---|---|---|
| 2 | 6.0 | 4.0 | 5.0 | 7.0 |
| 3 | 5.0 | 5.0 | 7.0 | 9.0 |
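Dropping rows is not the only option: we can also fill the missing entries, or drop columns instead of rows. A minimal sketch (the fill value 0 here is chosen purely for illustration):

df3.fillna(0) ## replace every NaN with 0
df3.dropna(axis=1) ## drop the columns containing NaN (colm2 and colm3) instead of rows

Note that both methods return a new DataFrame and leave df3 itself unchanged.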
Up to this point we’ve been focused primarily on one-dimensional and two-dimensional data, stored in Series and DataFrame objects, respectively.
Often it is useful to go beyond this and store higher-dimensional data, that is, data indexed by more than one or two keys.
Example: suppose we would like to track data about states from two different years.
Using the Pandas tools we've already covered, we might be tempted to simply use Python tuples as keys:
index = [('California', 2000), ('California', 2010), ## Data index
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956, ## Data values
18976457, 19378102,
20851820, 25145561]
pop = pd.Series(populations, index=index)
pop
(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64
But accessing values through this kind of index is not convenient; the lookups get messy:
# Now, if we want to access all data for 2010
pop[[i for i in pop.index if i[1] == 2010]]
(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64
So, instead of doing this, we can use a Pandas MultiIndex.
We can create a multi-index from the tuples as follows:
index = pd.MultiIndex.from_tuples(index)
index
MultiIndex([('California', 2000),
            ('California', 2010),
            ( 'New York', 2000),
            ( 'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )
Now, if we reindex our series (i.e., pop) with this MultiIndex, we see the hierarchical representation of the data:
pop= pop.reindex(index)
pop
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
# Now to access all data for which the second index is 2010
pop[:,2010]
California    37253956
New York      19378102
Texas         25145561
dtype: int64
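Partial indexing works the other way round as well: indexing with just the first level gives all years for one state.

pop["California"] ## all entries whose first index level is California
## 2000    33871648
## 2010    37253956
## dtype: int64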
Concatenation of Series and DataFrame objects is very similar to concatenation of NumPy arrays, which can be done via the np.concatenate function; in pandas it is done with pd.concat.
## Concatenating two series
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object
df1 = pd.DataFrame([["A1","B1"],
["A2","B2"]],index=[1,2],columns=["A","B"])
print("df1:\n",df1,"\n")
df2 = pd.DataFrame([["A3","B4"],
["A5","B6"]],index=[3,4],columns=["A","B"])
print("df2:\n",df2,"\n")
print("Concatenated Data:\n",pd.concat([df1, df2]))
df1:
    A   B
1  A1  B1
2  A2  B2

df2:
    A   B
3  A3  B4
4  A5  B6

Concatenated Data:
    A   B
1  A1  B1
2  A2  B2
3  A3  B4
4  A5  B6
df1 = pd.DataFrame([["A1","B1"],
["A2","B2"]],index=[1,2],columns=["A","B"])
print("df1:\n",df1,"\n")
df2 = pd.DataFrame([["C1","D1"],
["C2","D2"]],index=[1,2],columns=["C","D"])
print("df2:\n",df2,"\n")
print("Concatenated Data:\n",pd.concat([df1, df2],axis=1))
df1:
    A   B
1  A1  B1
2  A2  B2

df2:
    C   D
1  C1  D1
2  C2  D2

Concatenated Data:
    A   B   C   D
1  A1  B1  C1  D1
2  A2  B2  C2  D2
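One point worth noting: pd.concat aligns on the index, so along axis=1 any index label present in one frame but missing from the other is filled with NaN. A small sketch, reusing df1 from above together with a hypothetical frame df4 whose index only partially overlaps:

df4 = pd.DataFrame([["C1","D1"]],index=[2],columns=["C","D"]) ## only index label 2
pd.concat([df1, df4], axis=1) ## row 1 gets NaN in columns C and D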
An essential piece of analyzing large data is efficient summarization: computing aggregations like the sum, mean, median, min, max, standard deviation, and full summaries.
Listing of Pandas aggregation methods
| Function | Description |
|---|---|
| count() | Total number of items |
| first(), last() | First and last item |
| mean(), median() | Mean and median |
| min(), max() | Minimum and maximum |
| std(), var() | Standard deviation and variance |
| mad() | Mean absolute deviation |
| prod() | Product of all items |
| sum() | Sum of all items |
| describe() | Summary of a DataFrame or Series |
iris.iloc[:,:4].sum()/iris.shape[0] ## Mean
Sepal.Length    5.843333
Sepal.Width     3.057333
Petal.Length    3.758000
Petal.Width     1.199333
dtype: float64
iris.iloc[:,:4].mean() ## Mean
Sepal.Length    5.843333
Sepal.Width     3.057333
Petal.Length    3.758000
Petal.Width     1.199333
dtype: float64
iris.iloc[:,:4].describe() ## Summary
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
| std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
Simple aggregations can give you a flavor of your dataset, but often we would prefer to aggregate conditionally on some label or index: this is implemented in the so-called groupby operation.
A canonical example of this split-apply-combine operation is one where the "apply" step is a summation aggregation, as sketched below.
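In code, the three steps can be illustrated on a toy DataFrame (the key and data columns here are made up for illustration):

toy = pd.DataFrame({"key": ["A", "B", "A", "B"],
                    "data": [1, 2, 3, 4]})
toy.groupby("key").sum() ## split on key, apply sum per group, combine
## key A -> 1+3 = 4, key B -> 2+4 = 6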
iris.groupby("Species").min()
| Species | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
|---|---|---|---|---|
| setosa | 4.3 | 2.3 | 1.0 | 0.1 |
| versicolor | 4.9 | 2.0 | 3.0 | 1.0 |
| virginica | 4.9 | 2.2 | 4.5 | 1.4 |
We can use the describe() method of DataFrames to perform a set of aggregations that describe each group in the data:
iris.groupby("Species")["Sepal.Length"].describe()
| Species | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| setosa | 50.0 | 5.006 | 0.352490 | 4.3 | 4.800 | 5.0 | 5.2 | 5.8 |
| versicolor | 50.0 | 5.936 | 0.516171 | 4.9 | 5.600 | 5.9 | 6.3 | 7.0 |
| virginica | 50.0 | 6.588 | 0.635880 | 4.9 | 6.225 | 6.5 | 6.9 | 7.9 |